Bulk RNA-seq data analysis - CD4/CD8 stimulation - Big Data course 2025

This markdown has the results for the analysis of the data used in the practice for the Big Data course 2025. Done by Nathan.

Initial set-up

setwd('~/Desktop/PhD/Others/BMS-BigData-Course-2025/data-analysis-2025/')
library(dplyr)
library(tidyr)
library(ggplot2)
library(knitr)
library(stringr)
library(patchwork)
library(edgeR)
library(clusterProfiler)
library(enrichplot)
library(org.Hs.eg.db)
library(ReactomePA)
library(viridis)
library(ggrepel)
library(purrr)
library(biomaRt)
library(pheatmap)
library(RColorBrewer)
library(reshape2)
library(variancePartition)
library(forcats)
# Define some functions
default_theme <- function(){
  theme_bw() +
    theme(panel.grid = element_blank(),
          panel.border = element_rect(linewidth = 1),
          axis.title = element_text(size = rel(1)),
          axis.title.y.left = element_text(margin = margin(r = 10, unit = 'pt')),
          axis.title.y.right = element_text(margin = margin(l = 10, unit = 'pt')),
          axis.title.x.bottom = element_text(margin = margin(t = 10, unit = 'pt')),
          plot.title = element_text(size = rel(1), hjust = 0.5, face = 'bold'),
          axis.text = element_text(size = rel(1)),
          axis.ticks.length = unit(5, 'pt'),
          strip.background = element_rect(fill = "#f5a9a9", linewidth = 0),
          strip.text = element_text(size = rel(1), color = "black")
    )
}

plotPC <- function(pca_obj, coldata, coord_1, coord_2, color_by) {

  var_explained <- pca$sdev^2 / sum(pca$sdev^2)
  var_explained_df <- tibble(PC = paste0('PC', seq_along(var_explained)),
                             Var_Explained = var_explained)
  
  df <- as.data.frame(cbind(coldata, pca$x))
  
  x_label <- paste0(coord_1, ' (', round(var_explained_df$Var_Explained[var_explained_df$PC == coord_1]*100, digits = 2), '%)')
  y_label <- paste0(coord_2, ' (', round(var_explained_df$Var_Explained[var_explained_df$PC == coord_2]*100, digits = 2), '%)')
  
  p <- ggplot(df, aes(x = !!sym(coord_1), y = !!sym(coord_2), color = as.factor(!!sym(color_by)))) +
    geom_point(size = 3) +
    scale_color_viridis_d() +
    labs(x = x_label, y = y_label) +
    default_theme()
  
  return(p)
  
}

# The next function we define is to create a scree plot
# Makes a Scree plot based on the PCA results, if n_pcs = NULL, plots all available PCs
plotScree <- function(pca, n_pcs = NULL){
  
  if (is.null(n_pcs)){
    n_pcs <- ncol(pca$x)
  }
  
  # Extract variance explained
  var_explained <- pca$sdev^2 / sum(pca$sdev^2)
  cumulative_var <- cumsum(var_explained)
  
  # Create a data frame for plotting
  plot_data <- data.frame(
    PC = seq_along(var_explained),
    Variance_Explained = var_explained,
    Cumulative_Explained = cumulative_var
  )
  
  plot_data <- plot_data[1:n_pcs,]
  
  # Plot
  ggplot(plot_data, aes(x = factor(PC))) +
    geom_bar(aes(y = Variance_Explained), stat = "identity", fill = "#22A884FF") +
    geom_line(aes(y = Cumulative_Explained, group = 1), color = "#2A788EFF", linewidth = 1) +
    geom_point(aes(y = Cumulative_Explained), color = "#2A788EFF", size = 2) +
    scale_y_continuous(
      name = "Fraction of Variance Explained",
      sec.axis = sec_axis(~., name = "Cumulative Explained Variance"),
      expand = c(0,0)
    ) +
    labs(
      title = "Scree Plot",
      x = "Principal Component",
      y = "Fraction of Variance"
    ) +
    default_theme()
  
}

Load the dataset

counts <- read.table('data/counts.tsv')
metadata <- read.table('data/metadata.tsv')

# Check sample names match
all(rownames(metadata) %in% colnames(counts))
[1] TRUE
all(rownames(metadata) == colnames(counts))
[1] TRUE
# Add extra information in the metadata
extra_metadata <- tibble(
    sample = c('R0083', 'A0325', 'A0469', 'A0509', 'A0506', 'R0082'),
    condition = c('CeD', 'Control', 'Control', 'CeD', 'Control', 'CeD'),
    sex = c('F', 'F', 'M', 'F', 'M', 'F'),
    age = c(16, 14, 14, 18, 17, 13)
)

# Change some names for clarity
metadata <- metadata |>
    mutate(
        celltype = if_else(celltype == 'CD4T', 'CD4', 'CD8'),
        stimulation = case_when(
            stimulation == 'beadstim' & celltype == 'CD4' ~ 'CD3_CD28_stimulation',
            stimulation == 'unstim' & celltype == 'CD4' ~ 'unstimulated',
            stimulation == 'stim' & celltype == 'CD8' ~ 'stimulated_CD4_supernatant',
            stimulation == 'unstim' & celltype == 'CD8' ~ 'unstimulated_CD4_supernatant',
            stimulation == 'SAB' & celltype == 'CD8' ~ 'basal_media'
        )
    )

metadata <- left_join(metadata, extra_metadata, by = 'sample')
metadata$full_sample <- paste(metadata$celltype, metadata$stimulation, metadata$condition, sep = ".")
rownames(metadata) <- colnames(counts)

Quality control and exploratory analysis

y <- DGEList(counts = counts,
            samples = metadata,
            genes = rownames(counts))

Remove poorly expressed genes

Here I am using counts > 1000 as cutoff

# Histogram of log(CPM)
cpm <- cpm(y, log = TRUE)
hist(cpm)

# With the following lines we add arbitrary cut-offs to histogram
abline(v = log10(100), col = "blue",lwd = 2)
abline(v = log10(500), col = "purple",lwd = 2)
abline(v = log10(1000), col = "red",lwd = 2)

keep <- filterByExpr(y,
                    group = y$samples$group, # considers the group when filtering - optional
                    min.total.count = 1000) # the number you decided based on the histogram above

y_filtered <- y[keep, , keep.lib.sizes=FALSE]

print(paste0("Number of genes before filtering: ", dim(y)[1]))
[1] "Number of genes before filtering: 22334"
print(paste0("Number of genes after filtering: ", dim(y_filtered)[1]))
[1] "Number of genes after filtering: 3482"

Principal component analysis (PCA)

cpm <- cpm(y_filtered, log = TRUE)
pca <- prcomp(t(cpm), scale. = TRUE)
plotScree(pca, n_pcs = 20)

# Plot the first 2 PCs colored by the condition
p1 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'stimulation')

p2 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'celltype')

p3 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'condition')

print(p1 + p2 + p3)

p4 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'sex')

p5 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'age')

p6 <- plotPC(pca,
      coldata = y_filtered$samples,
      coord_1 = 'PC1',
      coord_2 = 'PC2',
      color_by = 'sample')

print(p4+p5+p6)

Normalization of read counts

y_filtered <- calcNormFactors(y_filtered)

Variance partition analysis

Alternatively to the PCA, you can use a “Variance Partition analysis” to quantify and interpret the sources of biological and technical variation in your data. This uses the package variancePartition. This uses the TMM normalized counts and a formula with the variables from your metadata you want to check (more on models and formulas will be explained in future steps). You can investigate any variables you want at once here, instead of one by one as you would do in the PCA.

# First we assess correlation between all pairs of variables
form <- ~ condition + celltype + age + sex + stimulation + lib.size
C <- canCorPairs(form, y_filtered$samples)
Warning in canCorPairs(form, y_filtered$samples): Regression model may be problematic.
High colinearity between variables:
  celltype and stimulation
plotCorrMatrix(C)

Now we are going to fit a model in the gene expression data to check how much each variable contributes to the variation, you can choose which variables to include based on the previous plot.

design <- model.matrix(~condition, y_filtered$samples)
v <- voom(y_filtered, design)

# We need to scale some variables so they are all in the same scale and comparable
y_filtered$samples$age <- scale(y_filtered$samples$age)
y_filtered$samples$lib.size_scaled <- scale(y_filtered$samples$lib.size)

# Categorical variables need to be specified with (1|variable)
form <- ~ (1|condition) + (1|sex) + (1|stimulation) + age + lib.size_scaled
varPart <- fitExtractVarPartModel(v, form, y_filtered$samples)

plotVarPart(varPart)

Differential expression analysis

Defining design matrix

# Here it is also important that the categorial variables are factors and the numerical variables are on the same scale
y_filtered$samples$sex <- factor(y_filtered$samples$sex)
y_filtered$samples$full_sample <- factor(y_filtered$samples$full_sample)

# The parameter data in model.matrix() is your metadata (y$samples)
design <- model.matrix(~ 0 + full_sample + sex + age, data = y_filtered$samples)

# Cleaning names for clarirty
colnames(design) <- str_replace_all(colnames(design), "full_sample", "")

knitr::kable(head(design))
CD4.CD3_CD28_stimulation.CeD CD4.CD3_CD28_stimulation.Control CD4.unstimulated.CeD CD4.unstimulated.Control CD8.basal_media.CeD CD8.basal_media.Control CD8.stimulated_CD4_supernatant.CeD CD8.stimulated_CD4_supernatant.Control CD8.unstimulated_CD4_supernatant.CeD CD8.unstimulated_CD4_supernatant.Control sexM age
X10_R0083_CD4T_bead_stim 1 0 0 0 0 0 0 0 0 0 0 0.3651484
X11_A0325_CD4T_bead_stim 0 1 0 0 0 0 0 0 0 0 0 -0.7302967
X12_A0469_CD4T_bead_stim 0 1 0 0 0 0 0 0 0 0 1 -0.7302967
X13_A0509_CD8_stim 0 0 0 0 0 0 1 0 0 0 0 1.4605935
X14_A0506_CD8_stim 0 0 0 0 0 0 0 1 0 0 1 0.9128709
X15_R0082_CD8_stim 0 0 0 0 0 0 1 0 0 0 0 -1.2780193

Contrasts matrix for CD4 T cells.

contrasts_matrix <- makeContrasts(
    # Effect of CD3/CD28 stimulation on CD4 control
    CD4_stim_control = CD4.CD3_CD28_stimulation.Control - CD4.unstimulated.Control,
    # Effect of CD3/CD28 stimulation on CD4 CeD
    CD4_stim_CeD = CD4.CD3_CD28_stimulation.CeD - CD4.unstimulated.CeD,
    # Effect of CD3/CD28 stimulation on CD4 (combined)
    CD4_stim_combined = (CD4.CD3_CD28_stimulation.Control + CD4.CD3_CD28_stimulation.CeD)/2 - (CD4.unstimulated.Control + CD4.unstimulated.CeD)/2,
    # What is the difference between CD4 CeD stimulated and CD4 Control stimulated?
    Diff_CD4_stim = CD4.CD3_CD28_stimulation.CeD - CD4.CD3_CD28_stimulation.Control,
    # What is the difference between CD4 CeD stimulated and CD4 Control stimulated?
    Diff_CD4_unstim = CD4.unstimulated.CeD - CD4.unstimulated.Control,
    levels = colnames(design)
)
knitr::kable(contrasts_matrix)
CD4_stim_control CD4_stim_CeD CD4_stim_combined Diff_CD4_stim Diff_CD4_unstim
CD4.CD3_CD28_stimulation.CeD 0 1 0.5 1 0
CD4.CD3_CD28_stimulation.Control 1 0 0.5 -1 0
CD4.unstimulated.CeD 0 -1 -0.5 0 1
CD4.unstimulated.Control -1 0 -0.5 0 -1
CD8.basal_media.CeD 0 0 0.0 0 0
CD8.basal_media.Control 0 0 0.0 0 0
CD8.stimulated_CD4_supernatant.CeD 0 0 0.0 0 0
CD8.stimulated_CD4_supernatant.Control 0 0 0.0 0 0
CD8.unstimulated_CD4_supernatant.CeD 0 0 0.0 0 0
CD8.unstimulated_CD4_supernatant.Control 0 0 0.0 0 0
sexM 0 0 0.0 0 0
age 0 0 0.0 0 0

Contrasts matrix for CD8 T cells.

contrasts_matrix_cd8 <- makeContrasts(
    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
    CD8_stim_combined = (CD8.stimulated_CD4_supernatant.CeD + CD8.stimulated_CD4_supernatant.Control)/2 - (CD8.basal_media.CeD + CD8.basal_media.Control)/2,
    # Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
    CD8_unstim_combined = (CD8.unstimulated_CD4_supernatant.CeD + CD8.unstimulated_CD4_supernatant.Control)/2 - (CD8.basal_media.CeD + CD8.basal_media.Control)/2,
    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Global effect
    CD8_stim_vs_unstim_combined = (CD8.stimulated_CD4_supernatant.CeD + CD8.stimulated_CD4_supernatant.Control)/2 - (CD8.unstimulated_CD4_supernatant.CeD + CD8.unstimulated_CD4_supernatant.Control)/2,

    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
    CD8_stim_control = CD8.stimulated_CD4_supernatant.Control - CD8.basal_media.Control,
    # Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
    CD8_unstim_control = CD8.unstimulated_CD4_supernatant.Control - CD8.basal_media.Control,
    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Controls
    CD8_stim_vs_unstim_control = CD8.stimulated_CD4_supernatant.Control - CD8.unstimulated_CD4_supernatant.Control,

    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
    CD8_stim_CeD = CD8.stimulated_CD4_supernatant.CeD - CD8.basal_media.CeD,
    # Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
    CD8_unstim_CeD = CD8.unstimulated_CD4_supernatant.CeD - CD8.basal_media.CeD,
    # Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - CeD
    CD8_stim_vs_unstim_CeD = CD8.stimulated_CD4_supernatant.CeD - CD8.unstimulated_CD4_supernatant.CeD,

    # What is the difference between CD8 CeD + stimulated supernatant and control?
    Diff_CD8_stim = CD8.stimulated_CD4_supernatant.CeD - CD8.stimulated_CD4_supernatant.Control,
    # What is the difference between CD8 CeD + unstimulated supernatant and control?
    Diff_CD8_unstim = CD8.stimulated_CD4_supernatant.CeD - CD8.stimulated_CD4_supernatant.Control,

    levels = colnames(design)
)
knitr::kable(contrasts_matrix_cd8)
CD8_stim_combined CD8_unstim_combined CD8_stim_vs_unstim_combined CD8_stim_control CD8_unstim_control CD8_stim_vs_unstim_control CD8_stim_CeD CD8_unstim_CeD CD8_stim_vs_unstim_CeD Diff_CD8_stim Diff_CD8_unstim
CD4.CD3_CD28_stimulation.CeD 0.0 0.0 0.0 0 0 0 0 0 0 0 0
CD4.CD3_CD28_stimulation.Control 0.0 0.0 0.0 0 0 0 0 0 0 0 0
CD4.unstimulated.CeD 0.0 0.0 0.0 0 0 0 0 0 0 0 0
CD4.unstimulated.Control 0.0 0.0 0.0 0 0 0 0 0 0 0 0
CD8.basal_media.CeD -0.5 -0.5 0.0 0 0 0 -1 -1 0 0 0
CD8.basal_media.Control -0.5 -0.5 0.0 -1 -1 0 0 0 0 0 0
CD8.stimulated_CD4_supernatant.CeD 0.5 0.0 0.5 0 0 0 1 0 1 1 1
CD8.stimulated_CD4_supernatant.Control 0.5 0.0 0.5 1 0 1 0 0 0 -1 -1
CD8.unstimulated_CD4_supernatant.CeD 0.0 0.5 -0.5 0 0 0 0 1 -1 0 0
CD8.unstimulated_CD4_supernatant.Control 0.0 0.5 -0.5 0 1 -1 0 0 0 0 0
sexM 0.0 0.0 0.0 0 0 0 0 0 0 0 0
age 0.0 0.0 0.0 0 0 0 0 0 0 0 0

Fit the model and find DEGs

v <- voom(y_filtered, design, plot=TRUE) # change to FALSE if you don't want the plot

# Fit the model
fit <- lmFit(v, design)

res_all_cd4 <- list()
for (x in colnames(contrasts_matrix)) {
  
  contr <- contrasts_matrix[, x]
  
  fit2 <- contrasts.fit(fit, contr)
  fit2 <- eBayes(fit2)
  res <- topTable(fit2, sort.by = "P", n = Inf)
  res$Contrast <- x
  res_all_cd4 <- append(res_all_cd4, list(res))
    
}
results_cd4 <- list_rbind(res_all_cd4)

res_all_cd8 <- list()
for (x in colnames(contrasts_matrix_cd8)) {
  
  contr <- contrasts_matrix_cd8[, x]
  
  fit2 <- contrasts.fit(fit, contr)
  fit2 <- eBayes(fit2)
  res <- topTable(fit2, sort.by = "P", n = Inf)
  res$Contrast <- x
  res_all_cd8 <- append(res_all_cd8, list(res))
    
}
results_cd8 <- list_rbind(res_all_cd8)

This list have the genes with their Ensembl ID, to help with interpretation we will add the gene symbol and entrezid annotation before we continue.

mart <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
genemap <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"),
                 filters = "ensembl_gene_id",
                 values = unique(results_cd4$genes),
                 mart = mart)
results_cd4 <- left_join(results_cd4, genemap, by = join_by("genes" == "ensembl_gene_id"), relationship = 'many-to-many')

genemap <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol", "entrezgene_id"),
                 filters = "ensembl_gene_id",
                 values = unique(results_cd8$genes),
                 mart = mart)
results_cd8 <- left_join(results_cd8, genemap, by = join_by("genes" == "ensembl_gene_id"), relationship = 'many-to-many')

Saving the results. Considering significant genes with |log2FC| > 1 and adj. p-value < 0.05.

results_significant_cd4 <- results_cd4 |>
    dplyr::filter(abs(logFC) > 1 & adj.P.Val < 0.05)

results_significant_cd8 <- results_cd8 |>
    dplyr::filter(abs(logFC) > 1 & adj.P.Val < 0.05)

# Save the results
# Specify the path to the folder you want to save the results
folder_path <- "results"

# Check if the folder exists
if (!dir.exists(folder_path)) {
  # If it doesn't exist, create it
  dir.create(folder_path)
  cat(paste0("Folder '", folder_path, "' created successfully!\n"))
} else {
  cat(paste0("Folder '", folder_path, "' already exists.\n"))
}
Folder 'results' already exists.
write.csv(results_cd4, paste0(folder_path, "/DEG_results_CD4T_cells.csv"), row.names=F)
write.csv(results_significant_cd4, paste0(folder_path, "/DEG_results_CD4T_cells_significant_only.csv"), row.names=F)

write.csv(results_cd8, paste0(folder_path, "/DEG_results_CD8T_cells.csv"), row.names=F)
write.csv(results_significant_cd8, paste0(folder_path, "/DEG_results_CD8T_cells_significant_only.csv"), row.names=F)
# Define a function to make a volcano plot
# Makes a volcano plot, highlighting the genes above the specified log2FC threshold and labels the top n genes (based on padj)
VolcanoPlot <- function(res, threshold = 0.5, p_cutoff = 0.05, n_labels = 15, title = '') {
    # Prepare the data
  res_plot <- res |>
    mutate(DE = case_when(
      adj.P.Val < p_cutoff & logFC >= threshold ~ 'upregulated',
      adj.P.Val < p_cutoff & logFC <= -threshold ~ 'downregulated',
      adj.P.Val >= p_cutoff | abs(logFC) < threshold ~ 'not DE'
    ))
  
  # Add label to the top n genes (ranked by FC)
  n <- n_labels
  
  top_genes <- res_plot |> dplyr::filter(DE != 'not DE') |> 
    group_by(sign(logFC)) |> 
    top_n(n_labels, -log10(adj.P.Val)) |> 
    pull(hgnc_symbol)
  res_plot$genelabel <- ''
  res_plot$genelabel[res_plot$hgnc_symbol %in% top_genes] <- res_plot$hgnc_symbol[res_plot$hgnc_symbol %in% top_genes]
  
  color_map <- c("not DE" = "gray", "downregulated" = "#2166ac", "upregulated" = "#b2182b")
  
  ggplot(res_plot, aes(x = logFC, y = -log10(adj.P.Val))) +
    geom_point(aes(colour = DE), size = 1.5) +
    geom_text_repel(aes(label = genelabel), min.segment.length = 0.25, force = 10) +
    geom_hline(yintercept = -log10(p_cutoff), colour = '#696969', linetype = 'dashed') +
    geom_vline(xintercept = -threshold, colour = '#696969', linetype = 'dashed') +
    geom_vline(xintercept = threshold, colour = '#696969', linetype = 'dashed') +
    scale_color_manual('', values = color_map) +
    default_theme() +
    theme(legend.text = element_text(size = rel(1.25))) +
    labs(title = title,
         x = 'log2 fold change',
         y = '-log10 adjusted p-value')
  
}

CD4 T cells

volcanos_cd4 <- list()
for(c in unique(results_cd4$Contrast)) {
    data <- dplyr::filter(results_cd4, Contrast == c)
    p <- VolcanoPlot(data, threshold = 1, p_cutoff = 0.05, n_labels = 15, title = c)
    volcanos_cd4 <- append(volcanos_cd4, list(p))
}

wrap_plots(volcanos_cd4, guides = 'collect', axis_titles = 'collect')
Warning: Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 7 unlabeled data points (too many overlaps). Consider increasing max.overlaps

How to interpret the comparisons:

Contrast Meaning
CD4_stim_control Effect of CD3/CD28 stimulation on CD4 control
CD4_stim_CeD Effect of CD3/CD28 stimulation on CD4 CeD
CD4_stim_combined Effect of CD3/CD28 stimulation on CD4 (CeD and controls combined)
Diff_CD4_stim The difference between CD4 CeD stimulated and CD4 Control stimulated
Diff_CD4_unstim The difference between CD4 CeD unstimulated and CD4 Control unstimulated

The heatmap below show the expression of all these DEGs.

de_genes <- results_significant_cd4$genes |> unique()

# Choose samples to show - only CD4 T samples
samples <- y_filtered$samples |>
    filter(celltype == 'CD4')
samples_to_plot <- rownames(samples)

# Get the logCPM counts for these genes
cpm_de <- cpm(y_filtered, log = TRUE)
cpm_de <- cpm_de[de_genes, samples_to_plot]

# Calculates z-score
z <- t(scale(t(cpm_de)))

# Create annotation dataframe for the heatmap - this can be whatever you want to show about the samples (columns)
annotation_col <- y_filtered$samples[colnames(z), c('stimulation', 'condition')]
ann_colors <- list(
    condition = c('Control' = '#29d3d6',
                 'CeD' = '#910a0a'),
    stimulation = c('CD3_CD28_stimulation' = '#FEE5D9',
                   'unstimulated' = '#FCAE91'))

# Make the heatmap with values scaled by row (genes), so you can compare between samples (columns)
pheatmap(z, cluster_rows = T, show_rownames = F, show_colnames = F, border_color = NA,
        color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
        annotation_col = annotation_col,
        annotation_colors = ann_colors,
        treeheight_col = FALSE)

Checking enriched pathways for the CD4 comparisons

# Define the function to create the ranked gene list and run the analysis
run_gsea <- function(data, contrast, database = c('GO-BP', 'KEGG', 'Reactome')) {

  # Making sure there are no duplicated entrezid codes
  data <- data |> filter(Contrast == contrast) |> distinct(entrezgene_id, .keep_all = TRUE)
  
  # Creates the ordered gene list for given contrast
  foldchanges <- data$logFC
  names(foldchanges) <- data$entrezgene_id
  foldchanges <- sort(foldchanges, decreasing = TRUE)

  # Run the analysis for given database
  if(database == 'GO-BP') {
    res <- gseGO(geneList = foldchanges, OrgDb = "org.Hs.eg.db", keyType = "ENTREZID", ont = "BP", pvalueCutoff = 0.05)
  } else if(database == 'KEGG') {
    res <- gseKEGG(geneList = foldchanges, organism = 'hsa', pvalueCutoff = 0.05)
  } else if(database == 'Reactome') {
    res <- gsePathway(geneList = foldchanges, organism = 'human', pvalueCutoff = 0.05)
  } else {
    print("Choose a database among GO-BP, KEGG or Reactome.")
  }
  return(res)

}

# Function to plot the results
plot_gsea <- function(data, contrast, n = 10) {

  data_plot <- data[[contrast]]
  data_plot <- as.data.frame(data_plot@result)

  data_plot <- data_plot |>
    arrange(desc(abs(NES))) |>
    group_by(sign(NES)) |>
    dplyr::slice(1:n)

    p <- ggplot(data_plot, showCategory = n*2, aes(NES, fct_reorder(Description,NES), fill = -log10(p.adjust))) +
      geom_col() +
      geom_vline(xintercept = 0, colour = '#696969', linetype = 'dashed') +
      scale_fill_gradientn(colours=c("#b3eebe", "#46bac2", "#371ea3"),
                            guide=guide_colorbar(reverse=TRUE)) +
      labs(title = contrast,
          x = 'Normalized Enrichment Score',
          y = '') +
      default_theme() +
      theme(text = element_text(size = 11))

    return(p)
}

Apply the funcion iteratively and plot the results.

contrasts <- unique(results_significant_cd4$Contrast)

# Remove NA entrezid from the results
data_cd4 <- results_significant_cd4 |> filter(!is.na(entrezgene_id))

# GO-BP
gsea_cd4_go <- map(contrasts, run_gsea, data = data_cd4, database = 'GO-BP')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
minSize, : For some pathways, in reality P-values are less than 1e-10. You can
set the `eps` argument to zero for better estimation.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_go) <- contrasts
gsea_cd4_go_plots <- map(contrasts, plot_gsea, data = gsea_cd4_go, n = 10)
wrap_plots(gsea_cd4_go_plots, ncol = 1) + plot_annotation(title = 'GO-BP')

# KEGG
gsea_cd4_kegg <- map(contrasts, run_gsea, data = data_cd4, database = 'KEGG')
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_kegg) <- contrasts
gsea_cd4_kegg_plots <- map(contrasts, plot_gsea, data = gsea_cd4_kegg, n = 10)
wrap_plots(gsea_cd4_kegg_plots, ncol = 1) + plot_annotation(title = 'KEGG')

# ReactomePA
gsea_cd4_pa <- map(contrasts, run_gsea, data = data_cd4, database = 'Reactome')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.26% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.22% of the list).
The order of those tied genes will be arbitrary, which may produce unexpected results.
leading edge analysis...
done...
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
names(gsea_cd4_pa) <- contrasts
gsea_cd4_pa_plots <- map(contrasts, plot_gsea, data = gsea_cd4_pa, n = 10)
wrap_plots(gsea_cd4_pa_plots, ncol = 1) + plot_annotation(title = 'Reactome')

Checking overlap between DEGs in Control and CeD. Are there any CeD-specific gene or are the fold-changes different in CeD for overlapping genes?

cd4_res_control <- results_significant_cd4 |> filter(Contrast == 'CD4_stim_control' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
cd4_res_ced <- results_significant_cd4 |> filter(Contrast == 'CD4_stim_CeD' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)

library(ggvenn)
Loading required package: grid
ggvenn(list(
  Control = cd4_res_control$genes,
  CeD = cd4_res_ced$genes
))

Inspecting the unique CeD genes

unique_ced_genes <- setdiff(cd4_res_ced$hgnc_symbol, cd4_res_control$hgnc_symbol)
print(unique_ced_genes)
  [1] "ARL6IP1"  "FLOT2"    "SARAF"    "GNL2"     "ITGB7"    "NCL"     
  [7] "CCT3"     "KIF22"    "EBNA1BP2" "CYTIP"    "LY6E"     "EIF5B"   
 [13] "MYADM"    "SLC1A5"   "PHB1"     "CCT8P1"   "VDAC1"    "PKM"     
 [19] "EMP3"     "ATIC"     "KPNB1"    "DEF6"     "CD3E"     "PIK3R1"  
 [25] "SH2D2A"   "PIM1"     "CD96"     "SRPRB"    "SIT1"     "TMSB4X"  
 [31] "LTV1"     "RSL1D1"   "ARHGAP4"  "HJURP"    "HMGB2"    "ZDHHC12" 
 [37] "CBX6"     "EED"      "SLC16A1"  "EMB"      "TMA16"    "APEX1"   
 [43] "CD59"     "POLR3K"   "CDK4"     "NOB1"     "NIPAL3"   "KLF13"   
 [49] "HMGCS1"   "TMSB4XP4" "CCT8"     "DCTPP1"   "ETV6"     "TWF2"    
 [55] "SAMD9"    "SOS1"     "SPTBN1"   "TFAM"     "MECP2"    "ATP2B1"  
 [61] "H1-10"    "TNRC6B"   "S100A11"  "H1-2"     "KNOP1"    "BMS1"    
 [67] "ARHGAP18" "CAPN1"    "IL27RA"   "CEP57"    "DOCK8"    "AP1S2"   
 [73] "FXYD5"    "GCN1"     "ADA"      "CKAP2"    "HSPA8"    "DDX1"    
 [79] "MYO1G"    "AKAP13"   "RNF167"   "LDHA"     "DCTN3"    "NTAN1"   
 [85] "HMMR"     "PDE3B"    "TMSB4XP8" "LSP1"     "UTP3"     "CASP8AP2"
 [91] "STAT3"    "MT2A"     "RRBP1"    "MZT2A"    "ZNF367"   "EIF6"    
 [97] "SKAP1"    "RPL7L1"   "GRN"      "PARP9"    "CASP4"    "SUSD3"   
[103] "NMT2"     "ANXA5"    "SRF"      "RPIA"     "LRCH4"    "CTSS"    
[109] "POLR2C"   "RHOC"     "TGFB1"    "NIN"      "DGKZ"     "KCTD17"  
[115] "DLGAP5"   "BAX"      "RNF149"   "DYNC2I2"  "RSRP1"    "JAK3"    
[121] "MIEF1"    "ZAP70"    "H4C3"     "FAM117A"  "MICAL2"   "ABR"     
[127] "ILK"      "SNHG8"    "MRPL15"   "DCP2"     "RRP7A"    "NBEAL2"  
[133] "HEBP2"    "MIER1"    "SLC25A44" "SLC7A1"   "PIK3R5"   "UBE2L5"  
[139] "PCBP4"    "GIMAP6"   "SCARNA9"  "MT-TL1"   "PPM1M"    "RPS28"   
[145] "MT-TP"    "SNX14"    "TIMM10"   "HDAC7"    "GYG1"     "SYTL2"   
# Heatmap with these genes
de_genes <- setdiff(cd4_res_ced$genes, cd4_res_control$genes)

# Choose samples to show - only CD4 T samples
samples <- y_filtered$samples |>
    filter(celltype == 'CD4')
samples_to_plot <- rownames(samples)

# Get the logCPM counts for these genes
cpm_de <- cpm(y_filtered, log = TRUE)
cpm_de <- cpm_de[de_genes, samples_to_plot]

# Calculates z-score
z <- t(scale(t(cpm_de)))

# Create annotation dataframe for the heatmap - this can be whatever you want to show about the samples (columns)
annotation_col <- y_filtered$samples[colnames(z), c('stimulation', 'condition')]
ann_colors <- list(
    condition = c('Control' = '#29d3d6',
                 'CeD' = '#910a0a'),
    stimulation = c('CD3_CD28_stimulation' = '#FEE5D9',
                   'unstimulated' = '#FCAE91'))
rownames(cd4_res_ced) <- cd4_res_ced$genes
annotation_row <- cd4_res_ced[rownames(z), c('genes', 'hgnc_symbol')]
rownames(z) <- annotation_row$hgnc_symbol

# Make the heatmap with values scaled by row (genes), so you can compare between samples (columns)
pheatmap(z, cluster_rows = T, cluster_cols = T, show_rownames = T, show_colnames = F, border_color = NA,
        color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
        annotation_col = annotation_col,
        annotation_colors = ann_colors,
        treeheight_col = 5,
        fontsize = 7
        )

Are these unique genes enriched for something?

unique_data <- cd4_res_ced |> filter(hgnc_symbol %in% unique_ced_genes)
unique_data <- unique_data |> filter(!is.na(entrezgene_id))

# GSEA
unique_go <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'GO-BP')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
no term enriched under specific pvalueCutoff...
unique_kegg <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'KEGG')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
no term enriched under specific pvalueCutoff...
unique_pa <- run_gsea(unique_data, contrast = 'CD4_stim_CeD', database = 'Reactome')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
no term enriched under specific pvalueCutoff...
# ORA
universe <- results_cd4 |>
  filter(Contrast == 'CD4_stim_CeD' & !is.na(entrezgene_id)) |> pull(genes)

ora_go <- enrichGO(
  gene = unique_data$entrezgene_id,
  universe = universe,
  ont = "BP",
  keyType = 'ENTREZID',
  OrgDb = org.Hs.eg.db,
  pvalueCutoff = 0.05,
  readable = TRUE
)
No gene sets have size between 10 and 500 ...
--> return NULL...
ora_kegg <- enrichKEGG(
  gene = unique_data$entrezgene_id,
  universe = universe,
  pvalueCutoff = 0.05
)
No gene sets have size between 10 and 500 ...
--> return NULL...
ora_pa <- enrichPathway(
  gene = unique_data$entrezgene_id,
  universe = universe,
  pvalueCutoff = 0.05
)
No gene sets have size between 10 and 500 ...
--> return NULL...

No enrichment.

Inspecting the shared genes

Same as above, checking if they are enriched for something.

shared_genes <- intersect(cd4_res_ced$hgnc_symbol, cd4_res_control$hgnc_symbol)
print(shared_genes)
  [1] "CD52"       "ARID5A"     "S100A10"    "IL2RA"      "NFKBIA"    
  [6] "FURIN"      "GZMB"       "PIM2"       "HSP90AB1"   "S100A4"    
 [11] "RYBP"       "PIM3"       "ADD3"       "VDR"        "S1PR4"     
 [16] "BTG1"       "SPN"        "SLFN5"      "HSPD1"      "YPEL3"     
 [21] "MYO1F"      "ZFP36L2"    "RIPOR2"     "CDC25B"     "EVI2B"     
 [26] "BCL3"       "VIM"        "EVL"        "BCL9L"      "HSPA5"     
 [31] "MTHFD2"     "DUSP6"      "LIMA1"      "PPP2R5C"    "TMX4"      
 [36] "NPM3"       "SORL1"      "S100A6"     "JUNB"       "PPIF"      
 [41] "LAPTM5"     "SDC4"       "ADGRE5"     "PRKACB"     "SNHG3"     
 [46] "DDX21"      "SUN2"       "NFKB2"      "EPB41"      "HSPA4"     
 [51] "STAT1"      "RBL2"       "WARS1"      "GPATCH4"    "SIGIRR"    
 [56] "KLF2"       "ALOX5AP"    "STMN1"      "TFRC"       "SLCO3A1"   
 [61] "BATF"       "STK10"      "TLE5"       "CALM1"      "DUSP5"     
 [66] "HSP90AA1"   "PRMT1"      "PSMD11"     "NPM1"       "BIRC3"     
 [71] "DUSP4"      "CSF1"       "STK38"      "ARHGDIB"    "SEPTIN9"   
 [76] "NHERF1"     "H2AZ2"      "EEIG1"      "SLC27A2"    "IL7R"      
 [81] "CTLA4"      "IL16"       "MT-ND1"     "CYCS"       "CD3G"      
 [86] "RASGRP2"    "CD99"       "WDR43"      "ITGA4"      "LINC02481" 
 [91] "PLP2"       "CCND2"      "RASSF1"     "E2F2"       "PUM3"      
 [96] "PRELID1P6"  "NUSAP1"     "HSPH1"      "GTPBP4"     "SLC7A5"    
[101] "DSTN"       "PTPN18"     "YWHAG"      "ESF1"       "IL10RA"    
[106] "NME1"       "HSPE1P6"    "FOXP1"      "CCT6A"      "ARHGAP45"  
[111] "SH3BGRL3"   "MGST3"      "CNN2"       "BRD3"       "SRM"       
[116] "TCP1"       "PSMB5"      "PFDN2"      "TC2N"       "C16orf54"  
[121] "PTPRCAP"    "BIN2"       "NOP14"      "MRTO4"      "ESYT1"     
[126] "MANF"       "PYCARD"     "GLCCI1"     "LGALS1"     "TRAF1"     
[131] "CDKN1B"     "NOLC1"      "FGD3"       "ITM2B"      "CTDSP1"    
[136] "FYB1"       "SLIRP"      "ABLIM1"     "ATP2A2"     "CCT2"      
[141] "LITAF"      "GNL3"       "PBXIP1"     "DNAJA1"     "DOK2"      
[146] "SLC3A2"     "MTATP6P1"   "RPL22L1"    "HCST"       "CARHSP1"   
[151] "BCL11B"     "DKC1"       "CDKN2D"     "TIMM13"     "MCUB"      
[156] "BBC3"       "MXD4"       "ANXA1"      "MT-CO1"     "NOL11"     
[161] "PFKL"       "HDGFL3"     "ERP29"      "DESI1"      "UTP4"      
[166] "TMEM123"    "MT-ND2"     "TRIB1"      "RESF1"      "ETF1"      
[171] "CCT4"       "MARS1"      "CLUH"       "SSBP3"      "MSC"       
[176] "SHISA5"     "CD44"       "CXCR3"      "CDK6"       "TSC22D3"   
[181] "TES"        "SEC11C"     "GPSM3"      "P2RY8"      "RCSD1"     
[186] "EIF4G1"     "POLR3GL"    "RNF213"     "TUBA1A"     "PAICS"     
[191] "FARSB"      "RBMS1"      "ADAM19"     "HSPE1"      "PACS1"     
[196] "RAB11FIP1"  "SLC43A3"    "FLI1"       "PHLDA1"     "RAB8B"     
[201] "RNF166"     "NFKBIB"     "MT-ATP6"    "RNASET2"    "CDCA7"     
[206] "MAPK6"      "METRNL"     "GSTK1"      "KCTD7"      "VMP1"      
[211] "NFIL3"      "SLC39A1"    "SRPK2"      "IFNAR2"     "TBC1D10C"  
[216] "GNG2"       "AQP3"       "ATP2A3"     "PSMA3"      "LTB"       
[221] "CAMK4"      "ARHGAP15"   "PCED1B-AS1" "MT-ND6"     "IDH2"      
[226] "ZNRF1"      "ID2"        "CLEC2B"     "GLIPR1"     "SPC24"     
[231] "CAST"       "RAB29"      "RGS1"       "SLA2"       "CORO7"     
[236] "AHNAK"      "BCAT1"      "TRAPPC1"    "JAML"       "GIMAP7"    
[241] "ASPM"       "CHCHD10"    "XPO5"       "MT-ND4L"    "PCSK7"     
[246] "PRELID1"    "RASSF3"     "PREX1"      "IGBP1"      "COPRS"     
[251] "ARL4C"      "PLAAT4"     "BHLHE40"    "CTDSP2"     "PES1"      
[256] "CCM2"       "APBB1IP"    "PRDM1"      "PNRC1"      "STIP1"     
[261] "GLUL"       "ISG20"      "SIRPG"      "CD37"       "NUCB2"     
[266] "SH3BGRL"    "RAB3GAP1"   "RGS19"      "HERPUD2"    "GRAP2"     
[271] "FAM3C"      "NLRC3"      "TIMP1"      "STAT5B"     "MT-ND4"    
[276] "FLOT1"      "CLSTN1"     "RNPEPL1"    "PLEKHF1"    "IFITM2"    
[281] "MT-ND5"     "TMBIM1"     "ARHGEF1"    "NCOA3"      "GLG1"      
[286] "MYB"        "IDH3A"      "VEZF1"      "PIK3IP1"    "PTPN6"     
[291] "YBX3"       "ARHGAP25"   "ASNS"       "PDCD4"      "SMAD3"     
[296] "UBE2Q2"     "EVI2A"      "LAG3"       "ENTPD1"     "RDH10"     
[301] "ARHGEF3"    "JUN"        "BIN1"       "MT2P1"      "PFKFB3"    
[306] "CAPG"       "ZNF282"     "CLU"        "IL4R"       "GNA15"     
[311] "SLC44A1"    "APOBEC3H"   "ZFP36"      "PRKCQ"      "GIMAP4"    
# Heatmap with these genes
de_genes <- intersect(cd4_res_ced$genes, cd4_res_control$genes)

# Choose samples to show - only CD4 T samples
samples <- y_filtered$samples |>
    filter(celltype == 'CD4')
samples_to_plot <- rownames(samples)

# Get the logCPM counts for these genes
cpm_de <- cpm(y_filtered, log = TRUE)
cpm_de <- cpm_de[de_genes, samples_to_plot]

# Calculates z-score
z <- t(scale(t(cpm_de)))

# Create annotation dataframe for the heatmap - this can be whatever you want to show about the samples (columns)
annotation_col <- y_filtered$samples[colnames(z), c('stimulation', 'condition')]
ann_colors <- list(
    condition = c('Control' = '#29d3d6',
                 'CeD' = '#910a0a'),
    stimulation = c('CD3_CD28_stimulation' = '#FEE5D9',
                   'unstimulated' = '#FCAE91'))
rownames(cd4_res_ced) <- cd4_res_ced$genes
annotation_row <- cd4_res_ced[rownames(z), c('genes', 'hgnc_symbol')]
rownames(z) <- annotation_row$hgnc_symbol

# Make the heatmap with values scaled by row (genes), so you can compare between samples (columns)
pheatmap(z, cluster_rows = T, cluster_cols = T, show_rownames = T, show_colnames = F, border_color = NA,
        color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(100),
        annotation_col = annotation_col,
        annotation_colors = ann_colors,
        treeheight_col = 5,
        fontsize = 7
        )

shared_data <- cd4_res_ced |> filter(hgnc_symbol %in% shared_genes)
shared_data <- shared_data |> filter(!is.na(entrezgene_id))

# GSEA
shared_go <- run_gsea(shared_data, contrast = 'CD4_stim_CeD', database = 'GO-BP')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
shared_kegg <- run_gsea(shared_data, contrast = 'CD4_stim_CeD', database = 'KEGG')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
shared_pa <- run_gsea(shared_data, contrast = 'CD4_stim_CeD', database = 'Reactome')
using 'fgsea' for GSEA analysis, please cite Korotkevich et al (2019).
preparing geneSet collections...
GSEA analysis...
leading edge analysis...
done...
# plot gsea reactome
n <- 5
data_plot <- shared_go |>
  arrange(desc(abs(NES))) |>
  group_by(sign(NES)) |>
  dplyr::slice(1:n)

p <- ggplot(data_plot, showCategory = n*2, aes(NES, fct_reorder(Description,NES), fill = -log10(p.adjust))) +
  geom_col() +
  geom_vline(xintercept = 0, colour = '#696969', linetype = 'dashed') +
  scale_fill_gradientn(colours=c("#b3eebe", "#46bac2", "#371ea3"),
                        guide=guide_colorbar(reverse=TRUE)) +
  labs(
      x = 'Normalized Enrichment Score',
      y = '') +
  default_theme() +
  theme(text = element_text(size = 11))
print(p)

# ORA
universe <- results_cd4 |>
  filter(Contrast == 'CD4_stim_CeD' & !is.na(entrezgene_id)) |> pull(genes)

ora_go <- enrichGO(
  gene = shared_data$entrezgene_id,
  universe = universe,
  ont = "BP",
  keyType = 'ENTREZID',
  OrgDb = org.Hs.eg.db,
  pvalueCutoff = 0.05,
  readable = TRUE
)
No gene sets have size between 10 and 500 ...
--> return NULL...
ora_kegg <- enrichKEGG(
  gene = shared_data$entrezgene_id,
  universe = universe,
  pvalueCutoff = 0.05
)
No gene sets have size between 10 and 500 ...
--> return NULL...
ora_pa <- enrichPathway(
  gene = shared_data$entrezgene_id,
  universe = universe,
  pvalueCutoff = 0.05
)
No gene sets have size between 10 and 500 ...
--> return NULL...

Now checking the concordance between the log2FC in control and in CeD.

data_compare <- results_significant_cd4 |> 
  filter(hgnc_symbol %in% shared_genes & !is.na(hgnc_symbol)) |>
  dplyr::select(hgnc_symbol, logFC, Contrast) |>
  pivot_wider(names_from = Contrast, values_from = logFC)

# Create a column called ratioFC, which is the FC in CeD divided by the FC in Control
# 1 = same FC in both cases, > 1 = more deregulated in CeD, < 1 = more deregulated in CeD
data_compare$ratioFC <- data_compare$CD4_stim_CeD / data_compare$CD4_stim_control
threshold <- 1.4 # Highlight genes that have ratioFC > threshold
data_compare$genelabel <- if_else(data_compare$ratioFC > threshold, data_compare$hgnc_symbol, '')

ggplot(data_compare, aes(x = CD4_stim_control, y = CD4_stim_CeD)) +
  geom_point(alpha = 0.3, size = 2, color = if_else(data_compare$ratioFC > threshold, "blue", "darkgray")) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_text_repel(aes(label = genelabel), force = 50, max.overlaps = 5000, min.segment.length = 0.1) +
  default_theme() +
  xlim(-5,5) +
  ylim(-5,5)

CD8 T cells

volcanos_cd8 <- list()
for(c in unique(results_cd8$Contrast)) {
    data <- dplyr::filter(results_cd8, Contrast == c)
    p <- VolcanoPlot(data, threshold = 1, p_cutoff = 0.05, n_labels = 15, title = c)
    volcanos_cd8 <- append(volcanos_cd8, list(p))
}
wrap_plots(volcanos_cd8[1:3], guides = 'collect', axis_titles = 'collect')
Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

wrap_plots(volcanos_cd8[4:6], guides = 'collect', axis_titles = 'collect')
Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
Warning: ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggrepel: 5 unlabeled data points (too many overlaps). Consider increasing max.overlaps

wrap_plots(volcanos_cd8[7:9], guides = 'collect', axis_titles = 'collect')
Warning: Removed 27 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

wrap_plots(volcanos_cd8[10:11], guides = 'collect', axis_titles = 'collect')

How to interpret the comparisons:

Contrast Meaning
CD8_stim_combined Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
CD8_unstim_combined Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Global effect
CD8_stim_vs_unstim_combined Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Global effect
CD8_stim_control Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
CD8_unstim_control Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - Controls
CD8_stim_vs_unstim_control Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - Controls
CD8_stim_CeD Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
CD8_unstim_CeD Effect of unstimulated CD4 T cells supernatant on CD8 cells (compared to basal media) - CeD
CD8_stim_vs_unstim_CeD Effect of stimulated CD4 T cells supernatant on CD8 cells (compared to unstimulated) - CeD
Diff_CD8_stim What is the difference between CD8 CeD + stimulated supernatant and control?
Diff_CD8_unstim What is the difference between CD8 CeD + unstimulated supernatant and control?

Focusing the following steps on the comparisons CD8 stim vs unstim, because I believe is more informative to see what is the effect of stimulated CD4 T cells on CD8 T cells, when compared to unstimulated CD4 (insted of the basal media). What DEGs are shared and unique to CeD and control?

cd8_res_control <- results_significant_cd8 |> filter(Contrast == 'CD8_stim_vs_unstim_control' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)
cd8_res_ced <- results_significant_cd8 |> filter(Contrast == 'CD8_stim_vs_unstim_CeD' & !is.na(hgnc_symbol)) |> distinct(genes, .keep_all = TRUE)

ggvenn(list(
  Control = cd8_res_control$genes,
  CeD = cd8_res_ced$genes
))

Looking at unique and shared genes

unique_genes <- setdiff(cd8_res_ced$hgnc_symbol, cd8_res_control$hgnc_symbol)
print("Unique genes to CeD:")
[1] "Unique genes to CeD:"
print(unique_genes)
 [1] "KDSR"    "VMP1"    "SLC27A2" "ADAM19"  "BATF"    "GINS2"   "HSPH1"  
 [8] "NFKBIB"  "NFKB2"   "IL2RA"   "SLC44A1"
shared_genes <- intersect(cd8_res_ced$hgnc_symbol, cd8_res_control$hgnc_symbol)
print("Shared genes:")
[1] "Shared genes:"
print(shared_genes)
 [1] "BCL3"    "PIM2"    "NFKBIA"  "DUSP5"   "PRMT1"   "TSC22D3" "GZMB"   
 [8] "CD52"    "JUNB"    "BIRC3"   "POU2F2"  "ZNRF1"  

Check the concordance for the shared genes:

data_compare <- results_significant_cd8 |> 
  filter(hgnc_symbol %in% shared_genes & !is.na(hgnc_symbol)) |>
  dplyr::select(hgnc_symbol, logFC, Contrast) |>
  pivot_wider(names_from = Contrast, values_from = logFC)

# Create a column called ratioFC, which is the FC in CeD divided by the FC in Control
# 1 = same FC in both cases, > 1 = more deregulated in CeD, < 1 = more deregulated in CeD
data_compare$ratioFC <- data_compare$CD8_stim_vs_unstim_CeD / data_compare$CD8_stim_vs_unstim_control
threshold <- 1.4 # Highlight genes that have ratioFC > threshold
data_compare$genelabel <- if_else(data_compare$ratioFC > threshold, data_compare$hgnc_symbol, '')

ggplot(data_compare, aes(x = CD8_stim_vs_unstim_control, y = CD8_stim_vs_unstim_CeD)) +
  geom_point(alpha = 0.3, size = 2, color = if_else(data_compare$ratioFC > threshold, "blue", "darkgray")) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_text_repel(aes(label = genelabel), force = 50, max.overlaps = 5000, min.segment.length = 0.1) +
  default_theme() +
  xlim(-5,5) +
  ylim(-5,5)